suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(RColorBrewer)
library(scatterplot3d)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv(here("doc/variables.csv"),
col_types=cols(
col_character(),
col_factor(),
col_factor(),
col_factor(),
col_factor()
))
tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.txt"),delim="\t",
col_names=c("TXID","GENE")))
filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
names(filelist) <- sub("_S\\d+.*","",basename(dirname(filelist)))
stopifnot(all(samples$ID==names(filelist)))
Read the expression at the gene level
This gives us directly gene counts
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",
tx2gene=tx2gene)$counts))
This is for transcript counts
tx <- suppressMessages(tximport(files = filelist,
type = "salmon",
txOut = TRUE))
tx.counts <- round(tx$counts)
# counts <- summarizeToGene(tx,tx2gene=tx2gene)
sel <- rowSums(tx.counts) == 0
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
round(sum(sel) * 100/ nrow(tx.counts),digits=1),
sum(sel),
nrow(tx.counts))
## [1] "14.9% percent (7940) of 53467 transcripts are not expressed"
dat <- tibble(x=colnames(tx.counts),y=colSums(tx.counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=TISSUE)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
i.e. the mean raw count of every transcript across samples is calculated and displayed on a log10 scale.
The cumulative transcript coverage is as expected
ggplot(data.frame(value=log10(rowMeans(tx.counts))),aes(x=value)) +
geom_density() + ggtitle("transcript mean raw tx.counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 7940 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by TISSUE
dat <- as.data.frame(log10(tx.counts)) %>% utils::stack() %>%
mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>%
mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>%
mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])
ggplot(dat,aes(x=values,group=ind,col=TISSUE)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per transcript raw counts (log10)")
## Warning: Removed 661300 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write_csv(as.data.frame(tx.counts) %>% rownames_to_column("ID"),
here("data/analysis/salmon/raw-unormalised-transcript-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromMatrix(
countData = tx.counts,
colData = samples,
design = ~ TISSUE * CONDITION)
## converting counts to integer mode
dds <- dds[,-match("P14065_119",colnames(dds))]
save(dds,file=here("data/analysis/salmon/dds-transcripts.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P14065_101 | P14065_102 | P14065_103 | P14065_104 | P14065_105 | P14065_106 |
|---|---|---|---|---|---|
| 0.9059 | 0.5975 | 0.7959 | 0.7663 | 1.074 | 0.9132 |
| P14065_107 | P14065_108 | P14065_109 | P14065_110 | P14065_111 | P14065_112 |
|---|---|---|---|---|---|
| 1.131 | 0.5848 | 0.7795 | 0.7331 | 0.4832 | 1.034 |
| P14065_113 | P14065_114 | P14065_115 | P14065_116 | P14065_117 | P14065_118 |
|---|---|---|---|---|---|
| 1.043 | 0.9788 | 0.6982 | 1.778 | 1.527 | 0.9935 |
| P14065_120 | P14065_121 | P14065_122 | P14065_123 | P14065_124 | P14065_125 |
|---|---|---|---|---|---|
| 1.158 | 1.857 | 0.9216 | 0.9973 | 1.959 | 1.693 |
| P14065_126 | P14065_127 | P14065_128 | P14065_129 | P14065_130 |
|---|---|---|---|---|
| 1.079 | 1.081 | 1.135 | 1.317 | 1.151 |
boxplot(sizes, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="grey")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
An the number of possible combinations
nlevel=nlevels(dds$TISSUE) * nlevels(dds$CONDITION)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
The PCA shows that a large fraction of the variance is explained by both variables.
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(dds$CONDITION)],
pch=c(17,19)[as.integer(dds$TISSUE)])
legend("topleft",
fill=pal[1:nlevels(dds$CONDITION)],
legend=levels(dds$CONDITION))
legend("topright",
pch=c(17,19),
legend=levels(dds$TISSUE))
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
The most variance comes from the tissues. In the leaf, the time has more effect than the treatment, while this look the opposite in the apex.
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=CONDITION,shape=TISSUE,text=ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized tx.counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise
conds <- factor(paste(dds$TISSUE,dds$CONDITION))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=2)
vst.cutoff <- 2
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="")
The main differences in the leaf samples are according to the days of sampling. There are no obvious differences at day 3 between treatments but it looks different at day 1 with higher differences. In the case of the apex, the bigger differences appear at day 3. There is a clear separation at day 1 too but day 0 appear grouped close to at 1
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.54.0 tximport_1.14.0
## [3] forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.4 purrr_0.3.3
## [7] readr_1.3.1 tidyr_1.0.2
## [9] tibble_2.1.3 tidyverse_1.3.0
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] plotly_4.9.2 pander_0.6.3
## [15] hyperSpec_0.99-20200213 xml2_1.2.2
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] here_0.1 gplots_3.0.1.2
## [21] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [23] DelayedArray_0.12.2 BiocParallel_1.20.1
## [25] matrixStats_0.55.0 Biobase_2.46.0
## [27] GenomicRanges_1.38.0 GenomeInfoDb_1.22.0
## [29] IRanges_2.20.2 S4Vectors_0.24.3
## [31] BiocGenerics_0.32.0 data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5 Hmisc_4.3-1
## [4] lazyeval_0.2.2 splines_3.6.2 crosstalk_1.0.0
## [7] digest_0.6.24 htmltools_0.4.0 gdata_2.18.0
## [10] fansi_0.4.1 magrittr_1.5 checkmate_2.0.0
## [13] memoise_1.1.0 cluster_2.1.0 limma_3.42.2
## [16] annotate_1.64.0 modelr_0.1.5 jpeg_0.1-8.1
## [19] colorspace_1.4-1 blob_1.2.1 rvest_0.3.5
## [22] haven_2.2.0 xfun_0.12 crayon_1.3.4
## [25] RCurl_1.98-1.1 jsonlite_1.6.1 hexbin_1.28.1
## [28] genefilter_1.68.0 survival_3.1-8 glue_1.3.1
## [31] gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
## [34] scales_1.1.0 DBI_1.1.0 Rcpp_1.0.3
## [37] viridisLite_0.3.0 xtable_1.8-4 htmlTable_1.13.3
## [40] foreign_0.8-75 bit_1.1-15.2 preprocessCore_1.48.0
## [43] Formula_1.2-3 htmlwidgets_1.5.1 httr_1.4.1
## [46] acepack_1.4.1 pkgconfig_2.0.3 XML_3.99-0.3
## [49] farver_2.0.3 nnet_7.3-12 dbplyr_1.4.2
## [52] locfit_1.5-9.1 tidyselect_1.0.0 labeling_0.3
## [55] rlang_0.4.4 later_1.0.0 AnnotationDbi_1.48.0
## [58] munsell_0.5.0 cellranger_1.1.0 tools_3.6.2
## [61] cli_2.0.1 generics_0.0.2 RSQLite_2.2.0
## [64] broom_0.5.4 fastmap_1.0.1 evaluate_0.14
## [67] yaml_2.2.1 knitr_1.28 bit64_0.9-7
## [70] fs_1.3.1 caTools_1.18.0 nlme_3.1-144
## [73] mime_0.9 compiler_3.6.2 rstudioapi_0.11
## [76] png_0.1-7 testthat_2.3.1 affyio_1.56.0
## [79] reprex_0.3.0 geneplotter_1.64.0 stringi_1.4.5
## [82] highr_0.8 Matrix_1.2-18 vctrs_0.2.2
## [85] pillar_1.4.3 lifecycle_0.1.0 BiocManager_1.30.10
## [88] bitops_1.0-6 httpuv_1.5.2 R6_2.4.1
## [91] latticeExtra_0.6-29 affy_1.64.0 promises_1.1.0
## [94] KernSmooth_2.23-16 gridExtra_2.3 gtools_3.8.1
## [97] assertthat_0.2.1 rprojroot_1.3-2 withr_2.1.2
## [100] GenomeInfoDbData_1.2.2 hms_0.5.3 rpart_4.1-15
## [103] rmarkdown_2.1 shiny_1.4.0 lubridate_1.7.4
## [106] base64enc_0.1-3